;
;  Copyright (c) 2016, Alexey Frunze
;  2-clause BSD license.
;
bits 16

    extern ___start__
    extern __start__relot, __stop__relot
    extern __start__relod, __stop__relod
    extern __start__bss
    extern __stop__bss
    extern ___Irq5Isr

section .text

    global __start
; __start: program entry point.
;
; Steps, in order:
;   1. check_pmode: terminate if CR0.PE is already set.
;   2. Compute the load base (physical address at which link-time address 0
;      ended up) into ebx.
;   3. Walk .relod and add the load base to every recorded dword.
;   4. Walk .relot and convert every recorded flat 32-bit address into a
;      real-mode far pointer (segment in the high word, offset in the low word).
;   5. Zero-fill .bss.
;   6. Hook interrupt vector 0xd, saving the previous vector in
;      ___pOldInt0xdIsr.
;   7. Far-call ___setup_unreal, load ds=es=0, enable interrupts and
;      far-call ___start__.
;
; Recurring idiom used to address a physical location held in a 32-bit
; register R without relying on unreal mode:
;       ror R, 4 / mov sreg, R16 / shr R, 28
; This leaves sreg = bits 4..19 of the address and R = bits 0..3 of it, i.e.
; a normalized seg:ofs pair. Address bits 20 and above are discarded, so the
; idiom is only meaningful for addresses below 1 MB.
__start:
    call    check_pmode

    ; Perform code and data relocations.
    ; Do this without using unreal (AKA big real) mode to make sure that
    ; it doesn't get disabled in the process by some driver or ISR.

    ; The call pushes the run-time offset (within cs) of .labnext; combining it
    ; with cs*16 gives the run-time physical address of .labnext, and
    ; subtracting the link-time value of .labnext gives the load base.
    call    .labnext
.labnext:
    xor     ebx, ebx
    mov     bx, cs
    shl     ebx, 4
    xor     eax, eax
    pop     ax
    add     ebx, eax
    sub     ebx, .labnext ; ebx = base physical address

    ; Patch addresses recorded in .relod (.relod is generated by the linker).
    ; [Note that the .relot section (handled later) contains some of the same
    ; addresses contained in the .relod section and .relod must be processed
    ; before .relot. .relot is used to transform flat 32-bit addresses into
    ; far addresses of a form segment:offset and this can only be done when
    ; the address relocation (using .relod) has been done.]
    ; Note that the following loop patches addresses in both code and data,
    ; including the addresses in the two following instructions!
    ; Consequently esi/ebp hold unrelocated (link-time) values throughout this
    ; loop, which is why ebx is added explicitly when forming addresses.
    mov     esi, __start__relod
    mov     ebp, __stop__relod
.relo_data_loop:
    cmp     esi, ebp
    jae     .relo_data_done

    lea     edi, [ebx + esi] ; edi = physical address of a relocation table element

    ; ds:di = table element
    ror     edi, 4
    mov     ds, di
    shr     edi, 28

    mov     edi, [di] ; edi = unrelocated address stored in the table element
    add     edi, ebx ; edi = physical address of a dword to which to add ebx

    ; ds:di = dword to relocate
    ror     edi, 4
    mov     ds, di
    shr     edi, 28

    add     [di], ebx ; actual relocation

    add     esi, 4 ; next 4-byte table element
    jmp     .relo_data_loop
.relo_data_done:

    ; Patch direct calls recorded in .relot (.relot is generated by the compiler
    ; and/or written by the programmer).
    ; The immediates below have already been relocated by the loop above, so
    ; esi/ebp are physical addresses here and ebx is not added.
    mov     esi, __start__relot
    mov     ebp, __stop__relot
.relo_text_loop:
    cmp     esi, ebp
    jae     .relo_text_done

    mov     edi, esi ; edi = physical address of a relocation table element
    ; ds:di = table element
    ror     edi, 4
    mov     ds, di
    shr     edi, 28

    mov     edi, [di] ; edi = address of an address which to transform into seg:ofs far address

    ; ds:di = dword to transform
    ror     edi, 4
    mov     ds, di
    shr     edi, 28

    ; Flat address -> far pointer:
    ;   shl 12 moves address bits 4..19 into the high word (the segment) and
    ;   leaves address bits 0..3 in the top nibble of ax;
    ;   rol ax, 4 brings those 4 bits down to bits 0..3 of ax (the offset).
    mov     eax, [di]
    shl     eax, 12
    rol     ax, 4
    mov     [di], eax ; actual transformation

    add     esi, 4 ; next 4-byte table element
    jmp     .relo_text_loop
.relo_text_done:

    ; Init .bss
    ; ebx = number of bytes still to clear, es:di = next byte to clear,
    ; al = fill value (0). The area is cleared in 32768-byte chunks.

    mov     edi, __start__bss
    mov     ebx, __stop__bss
    sub     ebx, edi ; ebx = size of .bss in bytes
    ror     edi, 4
    mov     es, di
    shr     edi, 28
    xor     al, al
    cld ; make stosb advance di upwards

.bss1:
    mov     ecx, 32768
    cmp     ebx, ecx
    jc      .bss2 ; fewer than 32768 bytes left: clear the tail and finish

    sub     ebx, ecx
    rep     stosb
    ; di has advanced by 32768; bring it back to its low 4 bits and move es
    ; forward by 2048 paragraphs (2048*16 = 32768 bytes) instead.
    and     di, 15
    mov     si, es
    add     si, 2048
    mov     es, si
    jmp     .bss1

.bss2:
    mov     cx, bx ; remaining count, below 32768 here
    rep     stosb

    ; Interrupts stay disabled from here until the sti below.
    cli

    ; Save IRQ5/#GP ISR address and install our IRQ5/#GP ISR
    xor     ax, ax
    mov     ds, ax ; ds = 0 to address the interrupt vector table
    mov     ebx, ___pOldInt0xdIsr
    ; es:bx = ___pOldInt0xdIsr
    ror     ebx, 4
    mov     es, bx
    shr     ebx, 28
    ; The instruction is hand-encoded so that its 32-bit immediate gets its own
    ; label, which is listed in .relot and therefore holds a seg:ofs far
    ; pointer by the time this executes.
    db      0x66, 0xB8 ; mov eax, far address of __Int0xdIsr()
.patch_int0xdisr_addr:
    dd      ___Int0xdIsr
    xchg    eax, [0xd*4] ; exchange vector 0xd addresses in IVT
    mov     [es:bx], eax ; store the existing/old IRQ5 ISR address in ___pOldInt0xdIsr

    ; Call __setup_unreal()
    ; 0x9A is the opcode of a direct far call; its 4-byte operand below is
    ; listed in .relot and has been converted to seg:ofs form.
    db      0x9A
;    db      0x66, 0xB8 ; mov eax, const
.patch_setup_unreal_addr:
    dd      ___setup_unreal

    ; We can now use flat 32-bit addresses with zero loaded into ds, es, fs, gs
    xor     ax, ax
    mov     ds, ax
    mov     es, ax

    sti

    ; Call __start__(), which will set up argc and argv for main() and call exit(main(argc, argv))
    ; (direct far call, operand patched via .relot like the one above)
    db      0x9A
.patch_start_addr:
    dd      ___start__
    ; __start__() shouldn't return

check_pmode:
    ; Near routine: verifies that the CPU is not already in protected mode.
    ; Returns (with ax = 0) when CR0.PE is clear. Otherwise prints a message
    ; through DOS and terminates the program with exit code 1.
    ; It runs before any relocation has been applied, so it must not use
    ; absolute addresses: the message address is obtained from the return
    ; address pushed by a near call placed right in front of the text.
    smsw    ax                      ; ax = low word of CR0
    and     ax, 1                   ; isolate CR0.PE
    jz      .real_mode

    ; Protected (virtual 8086) mode: report and quit.
    call    .report_and_exit        ; pushes the offset of the message below
    db      "The CPU is already in protected (virtual 8086) mode, can't set up unreal mode!",13,10,"$"
.report_and_exit:
    pop     dx                      ; dx = offset of the message within cs
    push    cs
    pop     ds                      ; ds:dx = message
    mov     ah, 0x09                ; DOS: print "$"-terminated string at ds:dx
    int     0x21
    mov     ax, (0x4c << 8) | 1     ; DOS: terminate, exit code 1
    int     0x21

.real_mode:
    ret

    global ___setup_unreal
; ___setup_unreal: far routine (returns with retf) that gives ds, es, fs and gs
; 4GB segment limits by briefly entering protected mode with a temporary GDT
; built on the stack.
;
; Preserved: eax, ebx, ds, es, fs, gs and EFLAGS (all saved on entry and
; restored before returning), so the interrupt flag comes back as the caller
; had it. Interrupts are disabled while the mode switch is in progress.
; Stack use beyond the saved registers: 3*8 bytes of GDT + 6 bytes of GDTR.
; CR0.PE is set with inc and cleared with dec, which is only correct when PE
; is clear on entry.
___setup_unreal: ; far
    ; Set 4GB segment limits for ds, es, fs, gs
    pushfd                          ; we will preserve EFLAGS.IF
    push    eax
    push    ebx
    push    ds
    push    es
    push    fs
    push    gs

    ; Prepare a temporary GDT on the stack
    ; (pushed from the last descriptor to the first, high dword before low
    ; dword, so that the table ends up in ascending order at ss:sp)
    ; 32-bit 4GB data segment descriptor (selector 0x10)
    push    dword 0x00cf9200
    push    dword 0x0000ffff
    ; 16-bit 64KB code segment (starting at .prot) descriptor (selector 0x08)
    mov     eax, .prot
    rol     eax, 16                 ; ax = address bits 16..31, high half of eax = address bits 0..15
    push    word 0                  ; descriptor bytes 6..7 are zero
    ; NOTE(review): ah holds address bits 24..31 at this point, so the access
    ; byte is only exactly 0x9a when .prot lies below 16MB - confirm this is
    ; always the case.
    or      ah, 0x9a
    push    ax                      ; descriptor bytes 4..5: base bits 16..23 and the access byte
    mov     ax, 0xffff
    push    eax                     ; descriptor bytes 0..3: limit 0xffff and base bits 0..15
    ; NULL descriptor (selector 0x00)
    push    dword 0
    push    dword 0

    ; Load GDTR
    xor     eax, eax
    mov     ax, ss
    shl     eax, 4
    movzx   ebx, sp                 ; bx = offset of the GDT within ss
    add     eax, ebx                ; eax = GDT address
    push    eax
    push    word 3*8-1              ; GDT size - 1
    ; The two pushes above moved sp down by 4+2 = 6 bytes from the value kept
    ; in bx, hence the -6.
    lgdt    [ss:bx-6]               ; load the GDTR

    ; Enter protected mode
    cli                             ; disable interrupts
    mov     ebx, cr0
    inc     ebx                     ; set bit 0 (PE), which is expected to be clear
    mov     cr0, ebx
    ; jmp far 0x08:.prot
    ; The base of the selector 0x08 segment is .prot itself, so offset 0
    ; continues execution at .prot.
    jmp     0x08:0
.prot:

    ; Reload the segment registers to activate the new segment limits.
    ; We don't need ss to have a 4GB limit as the stack is still restricted to 64KB.
    mov     ax, 0x10
    mov     ds, ax
    mov     es, ax
    mov     fs, ax
    mov     gs, ax

    ; Leave protected mode
    dec     ebx                     ; ebx still holds CR0 with PE set; clear PE again
    mov     cr0, ebx
    ; jmp far .real_addr
    ; Hand-encoded direct far jump (opcode 0xEA). Its operand is listed in
    ; .relot, so the startup code has turned it into a real-mode seg:ofs.
    db      0xEA
.patch_real_addr:
    dd      .real_addr
.real_addr:
    add     sp, 3*8+6               ; remove GDT and GDTR from the stack

    ; Reload the segment registers to match the base address and the selector
    pop     gs
    pop     fs
    pop     es
    pop     ds
    pop     ebx
    pop     eax
    popfd                           ; restore interrupt "enabledness"
    retf ; far

    global ___Int0xdIsr
; ___Int0xdIsr: handler installed by __start for interrupt vector 0xd, which
; is entered both for IRQ5 and for #GP.
;
; - First-level entry with IRQ5 in service: far-calls ___Irq5Isr() with
;   ds=es=0 and, if it returns a non-zero eax, also calls the previously
;   installed handler saved in ___pOldInt0xdIsr.
; - Any other entry (nested entry, or IRQ5 not in service) is treated as #GP
;   and handled by calling ___setup_unreal().
;
; All 32-bit general registers plus ds and es are saved on entry and restored
; before iret. fs and gs are not saved by this handler itself.
___Int0xdIsr:
    pushad
    push    ds
    push    es

    ; Check the depth of reentrancy.
    ; First entrance can be due to either IRQ5 or #GP.
    ; Second entrance can only be due to #GP.
    ; Rationale:
    ; 1. There can't be several IRQ5's handled in nested ISRs,
    ;    because this may overflow the stack, however,
    ;    there may be higher priority IRQs handled in nested
    ;    ISRs and ISRs may cause #GP if they make use of unreal mode.
    ; 2. #GP handler does not cause #GP's, nor enables interrupts.
    mov     ebx, Int0xdIsrDepth
    ; ds:bx = Int0xdIsrDepth (flat address split into a normalized seg:ofs)
    ror     ebx, 4
    mov     ds, bx
    shr     ebx, 28
    cmp     word [bx], 0
    jne     .gp                     ; already inside the IRQ5 path: must be #GP

    ; First entrance, check for IRQ5.
    mov     al, 0x0B
    out     0x20, al                ; send OCW3 to PIC to read ISR
    in      al, 0x20                ; read ISR
    and     al, 0x20                ; check ISR for IRQ5
    jz      .gp                     ; IRQ5 is not in service: must be #GP

    ; Call __Irq5Isr(). It may cause #GP (second entrance).

    inc     word [bx]               ; increment Int0xdIsrDepth

    xor     ax, ax
    mov     ds, ax
    mov     es, ax                  ; ds=es=0 for 32-bit flat addresses in __Irq5Isr()

    ; Hand-encoded direct far call (opcode 0x9A); the operand is listed in
    ; .relot and holds a seg:ofs far pointer at run time.
    db      0x9A
.patch_irq5isr_addr:
    dd      ___Irq5Isr
    ; Does __Irq5Isr() want us to call the old ISR?
    or      eax, eax
    jz      .irq5done               ; eax == 0: do not chain to the old ISR

    ; Call the old ISR
    mov     ebx, ___pOldInt0xdIsr
    ; ds:bx = ___pOldInt0xdIsr
    ror     ebx, 4
    mov     ds, bx
    shr     ebx, 28
    ; pushf followed by a far call builds the same stack frame as an INT
    ; instruction, so the old handler can return with iret.
    pushf
    call    far [bx]

.irq5done:
    mov     ebx, Int0xdIsrDepth
    ; ds:bx = Int0xdIsrDepth again (ds and bx were changed above)
    ror     ebx, 4
    mov     ds, bx
    shr     ebx, 28
    dec     word [bx]               ; decrement Int0xdIsrDepth
    jmp     .done

.gp:
    ; Call __setup_unreal()
    ; (hand-encoded direct far call, operand patched via .relot)
    db      0x9A
.patch_setup_unreal_addr2:
    dd      ___setup_unreal

;    mov     ebx, ___GpCnt
;    ror     ebx, 4
;    mov     ds, bx
;    shr     ebx, 28
;    inc     dword [bx]              ; increment ___GpCnt

.done:
    pop     es
    pop     ds
    popad
    iret

; Table of far-address fixups. Each entry is the address of a 4-byte operand
; that initially holds a flat 32-bit address; the .relo_text_loop in __start
; rewrites every listed operand in place as a real-mode far pointer
; (segment in the high word, offset in the low word).
section .relot ; .relot must exist for __start__relot and __stop__relot to also exist
    dd      __start.patch_int0xdisr_addr ; patch the far address of __Int0xdIsr()
    dd      __start.patch_setup_unreal_addr ; patch the far call to __setup_unreal()
    dd      __start.patch_start_addr ; patch the far call to __start__()
    dd      ___setup_unreal.patch_real_addr ; patch the far jump to switch to real mode
    dd      ___Int0xdIsr.patch_irq5isr_addr ; patch the far call to __Irq5Isr()
    dd      ___Int0xdIsr.patch_setup_unreal_addr2 ; patch the far call to __setup_unreal()

;section .relod ; .relod must exist for __start__relod and __stop__relod to also exist
;               ; the linker will generate .relod for us

; Uninitialized data used by the startup code and the vector 0xd handler.
; __start zero-fills the whole .bss area before using these variables.
section .bss ; .bss must exist for __start__bss and __stop__bss to also exist
;    global ___GpCnt
;___GpCnt resd 1
    global ___pOldInt0xdIsr
; Previous contents of interrupt vector 0xd, as exchanged out of the IVT by
; __start; ___Int0xdIsr far-calls through it to chain to the old handler.
___pOldInt0xdIsr resd 1
; Nesting counter of the IRQ5 path in ___Int0xdIsr. Reserved as a dword, but
; only its low word is read and modified by the handler.
Int0xdIsrDepth resd 1
